In [1]:
import altair as alt
import eland as ed
import json
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
alt.data_transformers.disable_max_rows()
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
df = pd.read_csv('../data/tirol_obituaries_deduped_weekly.csv', delimiter = ',')
In [3]:
df.head()
Out[3]:
district municipaly year week count
0 Lienz Matrei in Osttirol 2017 1 1
1 Lienz Matrei in Osttirol 2017 2 2
2 Lienz Matrei in Osttirol 2017 3 1
3 Lienz Matrei in Osttirol 2017 4 0
4 Lienz Matrei in Osttirol 2017 5 1
In [4]:
#http://www.statistik.at/wcm/idc/idcplg?IdcService=GET_NATIVE_FILE&RevisionSelectionMethod=LatestReleased&dDocName=122943
statistik_austria = pd.DataFrame({
    'week': [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1],
    'death_count': [160,153,152,136,130,128,137,123,117,119,132,133,138,136,123]
})
statistik_austria.head()
Out[4]:
week death_count
0 15 160
1 14 153
2 13 152
3 12 136
4 11 130

Outlier Detection

Running ./bin/parse.sh will create the source file for outlier detection: ./data/tirol_obituaries_deduped_weekly_outlier_detection_features.csv.

The columns of this file are: district,municipaly,year,week,count,yearly_max,weekly_max

yearly_max and weekly_max are the derived feature vectors. They are the difference between each rows count and the municipaly's maximum value of the row's week and the yearly maximum of that municipaly.

To calculate the outlier_score for each row, use the following configuration with Elasticsearch's Outlier Detection feature:

{
  "id": "tirol_outlier_high_count_2_1",
  "description": "",
  "source": {
    "index": [
      "tirol_outlier_high_count_2"
    ],
    "query": {
      "match_all": {}
    }
  },
  "dest": {
    "index": "tirol_outlier_high_count_dest_2_1",
    "results_field": "ml"
  },
  "analysis": {
    "outlier_detection": {
      "compute_feature_influence": true,
      "outlier_fraction": 0.05,
      "standardization_enabled": true
    }
  },
  "analyzed_fields": {
    "includes": [],
    "excludes": [
      "count",
      "week",
      "year"
    ]
  },
  "model_memory_limit": "2mb",
  "create_time": 1587934690989,
  "version": "8.0.0",
  "allow_lazy_start": false
}

To learn more about the feature head over to the docs here.

In [5]:
with open('elasticsearch-host', 'r') as file:
    es_host = file.read().replace('\n', '')

ed_outliers = ed.read_es(es_host, 'tirol_outlier_high_count_dest_3_1')
ed_outliers = ed_outliers[['district', 'municipaly', 'year', 'week', 'ml.outlier_score']]
df_outliers = ed.eland_to_pandas(ed_outliers)
df_outliers = df_outliers.rename(columns={'ml.outlier_score': 'outlier_score'})
df_outliers.to_csv('../data/tirol_obituaries_outliers_weekly.csv', index=False) 
df_outliers.head()
Out[5]:
district municipaly year week outlier_score
-4NzvXEBq9uwbHkGIe17 Kitzbühel Jochberg 2020 12 0.052896
-4NzvXEBq9uwbHkGIe57 Imst Längenfeld 2020 13 0.052896
-4NzvXEBq9uwbHkGIe97 Imst Nassereith 2020 14 0.052896
-4NzvXEBq9uwbHkGIep6 Reutte Steeg 2020 9 0.052896
-4NzvXEBq9uwbHkGIet6 Reutte Elbigenalp 2020 10 0.052896
In [6]:
alt.Chart(df_outliers, width=120, height=120).mark_bar().encode(
    alt.X("outlier_score:Q", bin=alt.Bin(maxbins=10)),
    y='count()',
    tooltip=['count()', 'outlier_score']
)
Out[6]:
In [7]:
# A dropdown filter
split = 'district'

outlier_distribution = alt.Chart(df_outliers, width=300)

points = outlier_distribution.mark_circle(opacity=0.9,size=20).encode(
    x='outlier_score:Q',
    y=split+':N',
    tooltip=['district', 'municipaly', 'week', 'outlier_score']
)

error_bars = outlier_distribution.mark_rule(opacity=0.2,strokeWidth=10).encode(
    x='ci0(outlier_score)',
    x2='ci1(outlier_score)',
    y=split+':N',
)

median_bars = outlier_distribution.mark_point(
    opacity=0.8,strokeWidth=1,size=200,shape='stroke',color='black',angle=90
).encode(
    x='median(outlier_score)',
    y=split+':N',
)

box_plot = outlier_distribution.mark_boxplot(outliers=False,median=False,opacity=0.5).encode(
    x='outlier_score:Q',
    y='district:N'
)


(box_plot + median_bars + points)
Out[7]:
In [8]:
alt.Chart(df_outliers, width=600).mark_boxplot().encode(
    x='outlier_score:Q'
)
Out[8]:
In [9]:
d_both = pd.merge(df, df_outliers, how='left', on=['district', 'municipaly', 'year', 'week'])
d_both.head()
Out[9]:
district municipaly year week count outlier_score
0 Lienz Matrei in Osttirol 2017 1 1 NaN
1 Lienz Matrei in Osttirol 2017 2 2 NaN
2 Lienz Matrei in Osttirol 2017 3 1 NaN
3 Lienz Matrei in Osttirol 2017 4 0 NaN
4 Lienz Matrei in Osttirol 2017 5 1 NaN
In [10]:
base = alt.Chart(df, width=470, height=200)

x_domain=(1,52)

week_cut_off='18'

week_point = base.transform_filter(
    'datum.year==2020 & datum.week<'+week_cut_off
).mark_circle(color='red', opacity=0, size=150).encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('sum(count)', title=''),
    tooltip=[
        alt.Tooltip('week:O', title="Kalenderwoche"),
        alt.Tooltip('sum(count):Q', title="Todesanzeigen")
    ]
)

week_historic_area = base.transform_filter(
    'datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
    week_count='sum(count)',
    groupby=['year', 'week']
).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('max(week_count):Q', title=''),
    alt.Y2('min(week_count):Q', title='')
)

week_historic_mean = base.transform_filter(
    'datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
    week_count='sum(count)',
    groupby=['year', 'week']
).mark_line(color='gray', opacity=0.25, strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('mean(week_count):Q', title='')
)

week_current_line = base.transform_filter(
    'datum.year==2020 & datum.week<'+week_cut_off
).mark_line(strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('sum(count):Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
)

week_statistik_austria_line = alt.Chart(statistik_austria).mark_line(clip=True,color='orange', strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('death_count:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False)),
    tooltip=[
        alt.Tooltip('week:O', title="Kalenderwoche"),
        alt.Tooltip('death_count:Q', title="Todesfälle")
    ]
)

week_statistik_austria_line_80 = alt.Chart(statistik_austria).transform_calculate(
    death_count_adjusted='datum.death_count*0.8'
).mark_line(clip=True,color='#ffbb78', strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('death_count_adjusted:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False)),
    tooltip=[
        alt.Tooltip('week:O', title="Kalenderwoche"),
        alt.Tooltip('death_count_adjusted:Q', title="Todesfälle 80%")
    ]
)

chart_3_legendDomain = [
    'Statistik Austria 2020',
#     'Statistik Austria 2020 80%',
    'Todesanzeigen 2020',
    'Todesanzeigen 2017-2019 min/max',
    'Todesanzeigen 2017-2019 mean'
]
chart_3_legendColors = [
    'orange',
#     '#ffbb78',
    '#1f77b4',
    'lightgray',
    'gray'
]
chart_3_legendData = pd.DataFrame({
    'label': chart_3_legendDomain
})

legend3 = alt.Chart(chart_3_legendData).mark_square(size=150).encode(
    y=alt.Y(
        'label:N',
        axis=alt.Axis(domain=False, ticks=False, orient='right'),
        title=None
    ),
    color=alt.Color(
        'label',
        scale=alt.Scale(domain=chart_3_legendDomain,range=chart_3_legendColors),
        legend=None
    )
)

vega_week = ((
    week_historic_area +
    week_historic_mean +
    week_statistik_austria_line +
#     week_statistik_austria_line_80 +
    week_current_line +
    week_point
) | legend3).properties(
    title={
      "text": ["Todesanzeigen Tirol pro Woche im Jahresverlauf"],
    }
).configure_axis(
    grid=True,
    gridColor="#eee",
    domainColor="#ddd",
    tickColor="#ddd",
    labelColor="gray",
    labelBound=True,
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=14,
    fontWeight='bold',
    anchor='start',
    color='gray'
)

vega_week
Out[10]:
In [11]:
def district_chart(district='Kufstein'):
    d_df = df[df['district']==district]

    d_base = alt.Chart(d_df).properties(
        width=220,
        height=125
    )

    x_domain=(1,52)
    y_domain=(0,40)

    d_week_historic_area = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').transform_joinaggregate(
        week_count='sum(count)',
        groupby=['year', 'week']
    ).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('max(week_count):Q', title=''),
        alt.Y2('min(week_count):Q', title='')
    )

    d_week_historic_mean = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).mark_line(color='gray', opacity=0.25, strokeJoin='round').transform_joinaggregate(
        week_count='sum(count)',
        groupby=['year', 'week']
    ).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('mean(week_count):Q', title='')
    )

    d_week_current_line = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).mark_line(strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('sum(count):Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
    )

    d_week_point = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).mark_circle(color='gray', opacity=0, size=150).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('sum(count):Q', scale=alt.Scale(domain=y_domain,zero=True,nice=False), title=''),
        tooltip=[
            alt.Tooltip('week:O', title="Kalenderwoche"),
            alt.Tooltip('sum(count):Q', title="Todesanzeigen")
        ]
    )

    return (
        d_week_historic_area +
        d_week_historic_mean +
        d_week_current_line +
        d_week_point
    ).properties(
        title={
          "text": [district],
        }
    )

d_chart_3_legendDomain = [
    'Todesanzeigen 2020',
    'Todesanzeigen 2017-2019 min/max',
    'Todesanzeigen 2017-2019 mean'
]
d_chart_3_legendColors = ['#1f77b4', 'lightgray', 'gray']
d_chart_3_legendData = pd.DataFrame({
    'label': d_chart_3_legendDomain
})


d_legend3 = alt.Chart(d_chart_3_legendData).mark_square(size=150).encode(
    y=alt.Y(
        'label:N',
        axis=alt.Axis(domain=False, ticks=False, orient='right'),
        title=None
    ),
    color=alt.Color(
        'label',
        scale=alt.Scale(domain=d_chart_3_legendDomain,range=d_chart_3_legendColors),
        legend=None
    )
)

vega_week_district = (
    (
         district_chart('Imst') | district_chart('Innsbruck-Stadt') | district_chart('Innsbruck-Land')
    ) & (
        district_chart('Kitzbühel') | district_chart('Kufstein') | district_chart('Landeck')
    ) & (
        district_chart('Lienz') | district_chart('Reutte') | district_chart('Schwaz')
    ) | d_legend3
).configure_axis(
        grid=True,
        gridColor="#eee",
        domainColor="#ddd",
        tickColor="#ddd",
        labelColor="gray",
        labelBound=True,
    ).configure_view(
        strokeWidth=0
    ).configure_title(
        fontSize=14,
        fontWeight='normal',
        anchor='start',
        color='gray'
    )

vega_week_district
Out[11]:
In [12]:
def municipaly_chart(district='Kufstein'):
    d_both_district = d_both[d_both['district']==district]

    d_base = alt.Chart(d_both_district).properties(
        width=150,
        height=60
    )

    x_domain=(1,52)
    y_domain=(0,20)
    
    outlier_threshold='0.5'

    d_week_historic_area = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).transform_joinaggregate(
        week_count='sum(count)',
        groupby=['week','year']
    ).transform_calculate(
        week_count_min='min(datum.week_count)',
        week_count_max='max(datum.week_count)'
    ).mark_area(fill='lightgray', opacity=.35, strokeJoin='round', strokeWidth=1, stroke='lightgray', strokeOpacity=0.35).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('max(week_count):Q', title=''),
        alt.Y2('min(week_count):Q', title='')
    )

    d_week_historic_mean = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).transform_joinaggregate(
        week_count='sum(count)',
        groupby=['week','year']
    ).mark_line(color='gray', opacity=0.25, strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('mean(week_count):Q', title='')
    )

    d_week_current_line = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).transform_joinaggregate(
        week_count='sum(count)',
        groupby=['week','year']
    ).mark_line(strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('week_count:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
    )

    d_week_point = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).mark_circle(color='gray', opacity=0, size=150).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('sum(count):Q', scale=alt.Scale(zero=True), title=''),
        tooltip=[
            alt.Tooltip('week:O', title="Kalenderwoche"),
            alt.Tooltip('sum(count):Q', title="Todesanzeigen"),
            alt.Tooltip('outlier_score:Q', title="Outlier-Score", format='.2%')
        ]
    )
    
    d_week_outlier = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off+' & datum.outlier_score>='+outlier_threshold
    ).transform_joinaggregate(
        week_count='sum(count)',
        outlier_score_max='max(outlier_score)',
        groupby=['week','year']
    ).mark_point(color='red', opacity=.4, size=60, strokeWidth=3).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('week_count:Q', scale=alt.Scale(zero=True), title='')
    )

    return (
        d_week_historic_area +
        d_week_historic_mean +
        d_week_current_line +
        d_week_outlier +
        d_week_point
    ).facet(
        facet=alt.Facet('municipaly:N', title=None, header=alt.Header(
            labelAnchor='start',
            labelOrient='bottom',
            labelPadding=0
        )),
        columns=5,
        padding=0,
        title={
          "text": ["Todesanzeigen im Bezirk "+district+" pro Gemeinde pro Woche im Jahresverlauf"], 
          "subtitle": [
    #               "Durchschnitt pro Monat in Hellblau",
              "Quelle: https://github.com/walterra/covid-19-tirol-ds"
          ],
          "color": "black",
          "subtitleColor": "gray"
        },
    ).configure_axis(
        grid=True,
        gridColor="#eee",
        domainColor="#ddd",
        tickColor="#ddd",
        labelColor="gray",
        labelBound=True,
    ).configure_view(
        strokeWidth=0
    ).configure_title(
        fontSize=14,
        fontWeight='bold',
        anchor='start',
        color='gray'
    )

d_chart_3_legendDomain = [
    'Todesanzeigen 2020',
    'Todesanzeigen 2017-2019 min/max',
    'Todesanzeigen 2017-2019 mean'
]
d_chart_3_legendColors = ['#1f77b4', 'lightgray', 'gray']
d_chart_3_legendData = pd.DataFrame({
    'label': d_chart_3_legendDomain
})


d_legend3 = alt.Chart(d_chart_3_legendData).mark_square(size=150).encode(
    y=alt.Y(
        'label:N',
        axis=alt.Axis(domain=False, ticks=False, orient='right'),
        title=None
    ),
    color=alt.Color(
        'label',
        scale=alt.Scale(domain=d_chart_3_legendDomain,range=d_chart_3_legendColors),
        legend=None
    )
)

vega_week_municipaly = municipaly_chart('Landeck')

vega_week_municipaly
Out[12]:
In [13]:
# save chart_1 as formatted JSON
with open("../docs/data/vega_week.json", "w") as jsonFile:
    json.dump(json.loads(vega_week.to_json()), jsonFile, indent=4, sort_keys=True)
In [14]:
# save chart_1 as image
vega_week.save("../assets/vega_week.png")
In [15]:
# save chart_1 as formatted JSON
with open("../docs/data/vega_week_district.json", "w") as jsonFile:
    json.dump(json.loads(vega_week_district.to_json()), jsonFile, indent=4, sort_keys=True)
In [16]:
# save chart_2 as image
vega_week_district.save("../assets/vega_week_district.png")
In [17]:
# save chart_3 as image
vega_week_municipaly.save("../assets/vega_week_municipaly.png")
In [18]:
districts = {
    "imst": "Imst",
    "innsbruck-land": "Innsbruck-Land",
    "innsbruck-stadt": "Innsbruck-Stadt",
    "kitzbuehel": "Kitzbühel",
    "kufstein": "Kufstein",
    "landeck": "Landeck",
    "reutte": "Reutte",
    "schwaz": "Schwaz",
    "lienz": "Lienz"
}

for d in districts:
    chart = municipaly_chart(districts[d])
    with open("../docs/data/vega_" + d + ".json", "w") as jsonFile:
        json.dump(json.loads(chart.to_json()), jsonFile, indent=4, sort_keys=True)